# coding:utf-8
'''
author:wangyi
'''

import os
import time
from collections import defaultdict

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


def get_download_result(journalname, volume, issue):
    '''
    Fetch all detail records for one journal issue from the KoreaMed search API.

    :param journalname: journal name, with spaces already replaced by '+'
    :param volume: volume number (str or int)
    :param issue: issue number within the volume (str or int)
    :return: list of result dicts for the given journal/volume/issue
    :raises requests.HTTPError: if the API responds with an error status
    :raises requests.Timeout: if the server does not answer within 30s
    '''
    # The query is assembled by hand on purpose: the API expects literal '+'
    # separators inside the q parameter, and passing a params dict would make
    # requests percent-encode them ('%2B') and break the search.
    url = ('https://koreamed.org/search/result?q=(journal:%22' + str(journalname)
           + '%22+AND+volume:' + str(volume) + ')+AND+issue:' + str(issue)
           + '&resultsPerPage=9999&page=1&display=Summary&sort=Date')
    print(url)
    # Timeout keeps a hung connection from stalling the whole crawl;
    # raise_for_status surfaces HTTP errors instead of a confusing
    # JSONDecodeError on an HTML error page.
    req = requests.get(url, timeout=30)
    req.raise_for_status()
    return req.json()['results']['data']


def main():
    '''
    Crawl KoreaMed end to end: collect journal names and their page ids,
    enumerate each journal's volume/issue pairs, then download every record
    and write one tagged text file per journal under ./crawer_results/.
    '''
    requests.adapters.DEFAULT_RETRIES = 5
    # Create the output directory up front so the crawl does not fail at the
    # first open() when it is missing (the original assumed it existed).
    os.makedirs('./crawer_results', exist_ok=True)
    journalname2page = _collect_journals()
    journalname2volumn = _collect_volumes(journalname2page)
    # Call the detail API for every volume/issue and write per-journal files.
    for journalname, volumes in journalname2volumn.items():
        _write_journal_file(journalname, volumes)


def _collect_journals():
    '''Scrape the journal index page; return {journal name: page id}.'''
    source_src = requests.get('https://koreamed.org/journals', timeout=30)
    soup = BeautifulSoup(source_src.text, 'lxml')
    journallinks = soup.findAll(name="a", attrs={"class": "DefaultJournalItemLink"})
    journalnames = soup.findAll(name="span", attrs={"class": "journalsubinfo"})
    journalname2page = {}
    # NOTE(review): only the first two journals are taken ([:2]) — looks like a
    # debug limit left in; confirm before running a full crawl.
    # Each journal has two link elements per name span, hence the 2*i index.
    for i, item in enumerate(journalnames[:2]):
        journalname2page[item.text.split('|')[0].strip()] = \
            journallinks[2 * i]['href'].split('/')[1].strip()
    return journalname2page


def _collect_volumes(journalname2page):
    '''For every journal page id, scrape its volume/issue pairs.

    :param journalname2page: {journal name: page id} from _collect_journals
    :return: defaultdict {journal name: [{'volume': ..., 'issue': ...}, ...]}
    '''
    journalname2volumn = defaultdict(list)
    for k, v in tqdm(journalname2page.items()):
        detail = requests.get('https://koreamed.org/volumes/' + v, timeout=30)
        soup = BeautifulSoup(detail.text, 'lxml')
        for volum in soup.findAll(name="div", attrs={"class": "text-md-center"}):
            href = volum.contents[1]['href']
            # Only links that belong to this journal's page id; the last two
            # path segments are volume and issue respectively.
            if href.find(v) != -1:
                parts = href.split('/')
                m = {'volume': parts[-2].strip(), 'issue': parts[-1].strip()}
                if m not in journalname2volumn[k]:
                    journalname2volumn[k].append(m)
    return journalname2volumn


def _write_journal_file(journalname, volumes):
    '''Download all records for one journal and write its output file.

    :param journalname: journal display name (also used as the file name)
    :param volumes: list of {'volume': ..., 'issue': ...} dicts
    '''
    # 'with' guarantees the handle is closed even if a request fails mid-crawl
    # (the original leaked the file on any exception before writer.close()).
    with open('./crawer_results/' + journalname + '.txt', 'w', encoding='utf-8') as writer:
        count = 1
        for vi in tqdm(volumes):
            for r in get_download_result('+'.join(journalname.split(' ')),
                                         vi['volume'], vi['issue']):
                _write_record(writer, count, r)
                count += 1
            # Throttle between issues to avoid hammering the server.
            time.sleep(5)


def _write_record(writer, count, r):
    '''Write one search-result dict in the PubMed-like tagged format.

    :param writer: open text file handle
    :param count: 1-based running record number within the journal file
    :param r: result dict from get_download_result
    '''
    pubinfo = r['publishinfo']
    # Segment after the first '.' holds 'date;vol(issue):pages' — split once
    # here instead of re-splitting for DP and PG separately.
    seg = pubinfo.split('.')[1]
    writer.write(str(count) + ':' + pubinfo + '\n')
    writer.write('\t' + 'IS\t-' + r['pissn'] + '(Print)' + '\n')
    # Typo fix: the original emitted '(Eletronic)'.
    writer.write('\t' + 'IS\t-' + r['eissn'] + '(Electronic)' + '\n')
    writer.write('\t' + 'VI\t-' + str(r['volume']) + '\n')
    writer.write('\t' + 'IP\t-' + str(r['issue']) + '\n')
    writer.write('\t' + 'TI\t-' + r['title'] + '\n')
    writer.write('\t' + 'DP\t-' + seg.split(';')[0] + '\n')
    writer.write('\t' + 'PG\t-' + seg.split(';')[1].split(':')[1] + '\n')
    writer.write('\t' + 'DOI\t-' + r['doi'] + '\n')
    writer.write('\t' + 'AB\t-' + r['abstract'] + '\n')
    # author_facet and author_initial are parallel lists — iterate in lockstep
    # instead of indexing by range(len(...)).
    for full_name, initials in zip(r['author_facet'], r['author_initial']):
        writer.write('\t' + 'FAU\t-' + full_name + '\n')
        writer.write('\t' + 'AU\t-' + initials + '\n')
    for affiliation in r['affiliate_facet']:
        writer.write('\t' + 'AD\t-' + affiliation + '\n')
    writer.write('\t' + 'LA\t-' + r['language'] + '\n')
    writer.write('\t' + 'PT\t-' + r['article_type'] + '\n')
    writer.write('\t' + 'TA\t-' + r['journal_id_nlm_ta'] + '\n')
    writer.write('\t' + 'DE\t-' + r['accepted_date'] + '\n')
    writer.write('\t' + 'KUID\t-' + r['id'] + '\n')
    writer.write('\t' + 'AID\t-' + r['doi'] + '[doi]\n')
    writer.write('\t' + 'SO\t-' + pubinfo + '\n')
    writer.write('\n')
    writer.flush()
# Script entry point: run the crawler only when executed directly.
if __name__ == '__main__':
    main()